Data

The data comes from the Elements of Statistical Learning textbook, and is originally from a study by Stamey et al. (1989) in which they examined the relationship between the level of prostate-specific antigen (PSA) and a number of clinical measures in men who were about to receive a prostatectomy. The variables are as follows:

These data are available in prostate.csv. Let’s start by reading in the data and some basic EDA.

# Read in the prostate data; the path is relative to the notebook's directory.
prostate = read.csv(".././data/prostate.csv")
# Quick look at the first few rows.
head(prostate)
##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
## 6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
##   train
## 1  TRUE
## 2  TRUE
## 3  TRUE
## 4  TRUE
## 5  TRUE
## 6  TRUE
# Numeric summaries for each variable. The train column is logical and splits
# the data into 67 training and 30 test observations.
summary(prostate)
##      lcavol           lweight           age             lbph        
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
##  1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
##  Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
##  Mean   : 1.3500   Mean   :3.629   Mean   :63.87   Mean   : 0.1004  
##  3rd Qu.: 2.1270   3rd Qu.:3.876   3rd Qu.:68.00   3rd Qu.: 1.5581  
##  Max.   : 3.8210   Max.   :4.780   Max.   :79.00   Max.   : 2.3263  
##       svi              lcp             gleason          pgg45       
##  Min.   :0.0000   Min.   :-1.3863   Min.   :6.000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.:  0.00  
##  Median :0.0000   Median :-0.7985   Median :7.000   Median : 15.00  
##  Mean   :0.2165   Mean   :-0.1794   Mean   :6.753   Mean   : 24.38  
##  3rd Qu.:0.0000   3rd Qu.: 1.1787   3rd Qu.:7.000   3rd Qu.: 40.00  
##  Max.   :1.0000   Max.   : 2.9042   Max.   :9.000   Max.   :100.00  
##       lpsa           train        
##  Min.   :-0.4308   Mode :logical  
##  1st Qu.: 1.7317   FALSE:30       
##  Median : 2.5915   TRUE :67       
##  Mean   : 2.4784                  
##  3rd Qu.: 3.0564                  
##  Max.   : 5.5829
# Visualize the data (except the last column indicating the test/train split)
# Work on a copy so the original data frame keeps svi numeric; the binary
# svi indicator is treated as a factor only for plotting.
prostate_gg = prostate
prostate_gg$svi = as.factor(prostate_gg$svi)
# Pairs plot of all variables except train (GGally::ggpairs — assumes GGally
# is loaded earlier in the document).
ggpairs(prostate_gg, columns = 1:(ncol(prostate)-1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

There are \(N=97\) observations with no missingness. From the pairs plot, we observe the following:

We see that both lcavol and lcp seem to have a linear relationship with the response lpsa.

Split the data

# Split the data
# Use the predefined train indicator to split (train is logical; == 1 matches TRUE).
train_df = data.frame(prostate[prostate$train==1,])
# Drop the indicator column from the modelling data (dplyr::select).
train_df = select(train_df, !c(train))
test_df = data.frame(prostate[prostate$train==0,])
test_df = select(test_df, !c(train))


N = dim(train_df)[1] #number of data points
D = dim(train_df) [2] -1 #number of features (all columns except the response lpsa)
Nnew = dim(test_df)[1]
print(paste("Number of data points =",N, " and number of features =", D))
## [1] "Number of data points = 67  and number of features = 8"

Fit linear regression

Let’s start by fitting a baseline linear regression model.

First, we will scale the inputs.

# Scale the data
# Means and SDs are computed on the TRAINING set only and applied to both sets,
# so no test-set information leaks into the preprocessing. Note that all D+1
# columns are standardized, including the response lpsa — later RMSEs are
# therefore on the standardized lpsa scale.
train.m = apply(train_df, 2, mean)
train.s = apply(train_df,2,sd)
train_df = (train_df - matrix(train.m,N,D+1,byrow = TRUE))/matrix(train.s,N,D+1,byrow = TRUE)
test_df = (test_df - matrix(train.m,Nnew,D+1,byrow = TRUE))/matrix(train.s,Nnew,D+1,byrow = TRUE)

Fit the linear regression model and visualize the coefficients.

# Fit lm
# Baseline OLS regression of lpsa on all 8 standardized predictors.
# The intercept estimate is ~0 because every column was centered.
lm_prostate = lm(lpsa~., data=train_df)
summary(lm_prostate)
## 
## Call:
## lm(formula = lpsa ~ ., data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36503 -0.28272 -0.04491  0.37209  1.23095 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.200e-16  7.205e-02   0.000  1.00000    
## lcavol       5.931e-01  1.105e-01   5.366 1.47e-06 ***
## lweight      2.423e-01  8.808e-02   2.751  0.00792 ** 
## age         -1.180e-01  8.455e-02  -1.396  0.16806    
## lbph         1.755e-01  8.538e-02   2.056  0.04431 *  
## svi          2.563e-01  1.038e-01   2.469  0.01651 *  
## lcp         -2.393e-01  1.282e-01  -1.867  0.06697 .  
## gleason     -1.732e-02  1.180e-01  -0.147  0.88389    
## pgg45        2.296e-01  1.321e-01   1.738  0.08755 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5897 on 58 degrees of freedom
## Multiple R-squared:  0.6944, Adjusted R-squared:  0.6522 
## F-statistic: 16.47 on 8 and 58 DF,  p-value: 2.042e-12
# Tidy the lm fit (broom::tidy) including confidence intervals, then plot the
# point estimates with horizontal error bars, one row per coefficient.
tidy_reg_conf_int = tidy(lm_prostate,conf.int =TRUE)
tidy_reg_conf_int %>%
  filter(term != "(Intercept)") %>%
  # reorder the coefficients so that the largest is at the top of the plot
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  # add in a dotted line at zero
  geom_vline(xintercept = 0, lty = 2) +
  labs(
    x = "Estimate of effect of variable on lpsa",
    y = NULL,
    title = "Coefficient plot with error bars") +
  theme_bw()

Compute the RMSE on the test data

# Predict on the held-out test set and compute RMSE (on the standardized
# lpsa scale, since the response was scaled with the predictors).
yhat = predict(lm_prostate, test_df)
rmse_lm = sqrt(mean((yhat-test_df$lpsa)^2))
print(paste("The RMSE for the linear model is ",rmse_lm))
## [1] "The RMSE for the linear model is  0.597769431021197"

Lasso

Now, we will use glmnet to fit a lasso model. First, we need to use cross-validation to tune \(\lambda\).

# Cross-validate the lasso penalty; the seed fixes the CV fold assignment
# so lambda.min is reproducible.
set.seed (1)
X = as.matrix(train_df[,1:D])
y = train_df$lpsa
# alpha = 1 selects the lasso; standardize = FALSE because the inputs were
# already scaled above.
cv.out=cv.glmnet(X, y, alpha=1, standardize = FALSE)
plot(cv.out)

# lambda.min minimizes the cross-validated error.
bestlam=cv.out$lambda.min
print(bestlam)
## [1] 0.006281436

Now, let’s refit with the best lambda

# Refit the lasso on the full training set at the CV-selected lambda.
glmnet.out=glmnet(X, y, alpha=1, lambda = bestlam, standardize = FALSE)
# Sparse coefficient vector: gleason is shrunk exactly to zero ("." entry).
glmnet.out$beta
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                 s0
## lcavol   0.5728689
## lweight  0.2390965
## age     -0.1050754
## lbph     0.1683333
## svi      0.2434682
## lcp     -0.1981421
## gleason  .        
## pgg45    0.1953291

Add the lasso estimates to our plot:

# Re-tidy the lm fit and append the lasso estimates as a new column;
# predict(..., type = "coefficients") returns a 9 x 1 vector (intercept first),
# matching the row order of tidy()'s output.
tidy_reg_conf_int = tidy(lm_prostate,conf.int =TRUE)
tidy_reg_conf_int$lasso = as.matrix(predict(glmnet.out,type="coefficients",s=bestlam))
tidy_reg_conf_int %>%
  filter(term != "(Intercept)") %>%
  # reorder the coefficients so that the largest is at the top of the plot
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  # add lasso estimates (blue triangles) on top of the OLS points
  geom_point(aes(lasso, term),shape = 2, color = 'blue') +
  # add in a dotted line at zero
  geom_vline(xintercept = 0, lty = 2) +
  labs(
    x = "Estimate of effect of variable on lpsa",
    y = NULL,
    title = "Coefficient plot with error bars") +
  theme_bw()

Compute the RMSE on the test data

# Test-set predictions at the selected lambda, then RMSE on the
# standardized lpsa scale (comparable with rmse_lm above).
Xtest = as.matrix(test_df[,1:D])
yhat = predict(glmnet.out, s = bestlam, Xtest)
rmse_lasso = sqrt(mean((yhat-test_df$lpsa)^2))
print(paste("The RMSE for the lasso is ",rmse_lasso))
## [1] "The RMSE for the lasso is  0.587015428770515"

Ridge and Elastic Net

Exercise: Try to repeat for ridge and elastic net and compare the coefficients and RMSE.

Bayesian Lasso

Run a simple Gibbs sampling algorithm.

# Define prior parameters
# NOTE(review): gibbs_blasso() is defined elsewhere in this project; the
# hyperparameter meanings below are inferred from the narrative — confirm
# against its definition. s appears to control the lasso shrinkage prior,
# (a_sigma, b_sigma) the prior on the noise variance, and s2_0 the prior
# variance on the intercept.
priors = list(s=1, a_sigma = 2, b_sigma = 1, s2_0 = 10)

I = 2000 #number of iterations

# Initialize
# Warm-start the chain at the OLS coefficients and residual variance.
w_init = lm_prostate$coefficients
sigma_init = mean(lm_prostate$residuals^2)

#Run MCMC
blasso_out = gibbs_blasso(X,y, priors, I, w_init, sigma_init)
## [1] "Number of iterations completed: 100"
## [1] "Number of iterations completed: 200"
## [1] "Number of iterations completed: 300"
## [1] "Number of iterations completed: 400"
## [1] "Number of iterations completed: 500"
## [1] "Number of iterations completed: 600"
## [1] "Number of iterations completed: 700"
## [1] "Number of iterations completed: 800"
## [1] "Number of iterations completed: 900"
## [1] "Number of iterations completed: 1000"
## [1] "Number of iterations completed: 1100"
## [1] "Number of iterations completed: 1200"
## [1] "Number of iterations completed: 1300"
## [1] "Number of iterations completed: 1400"
## [1] "Number of iterations completed: 1500"
## [1] "Number of iterations completed: 1600"
## [1] "Number of iterations completed: 1700"
## [1] "Number of iterations completed: 1800"
## [1] "Number of iterations completed: 1900"
## [1] "Number of iterations completed: 2000"

Check mixing and determine burnin (note here we started with good initial values!)

# Traceplots
# coda traceplots to assess mixing of the weights, local scales, and
# noise variance.
traceplot(as.mcmc(blasso_out$w))

traceplot(as.mcmc(blasso_out$tau))

traceplot(as.mcmc(blasso_out$sigma))

# Check other diagnostics in coda

# Remove burnin
# Keep draws b..I; note b:I retains iteration b itself, so I - b + 1 = 1001
# draws survive rather than exactly I - b.
b = 1000
blasso_out$w = blasso_out$w[b:I,]
blasso_out$tau = blasso_out$tau[b:I,]
blasso_out$sigma = blasso_out$sigma[b:I]

Let’s compare the coefficients

# Compute posterior mean and HPD intervals
# Column-wise posterior means and HPD intervals (coda::HPDinterval) for the
# post-burn-in weight draws.
bl_what = apply(blasso_out$w,2,mean)
bl_wci = apply(blasso_out$w,2,function(x){HPDinterval(as.mcmc(x))})
  
# Stash the OLS estimates in a new ls column, then overwrite estimate /
# conf.low / conf.high with the Bayesian posterior summaries so the plotting
# pipeline below can be reused unchanged.
tidy_reg_conf_int$ls = tidy_reg_conf_int$estimate 
tidy_reg_conf_int$estimate = bl_what
tidy_reg_conf_int$conf.low = bl_wci[1,]
tidy_reg_conf_int$conf.high = bl_wci[2,]

tidy_reg_conf_int %>%
  filter(term != "(Intercept)") %>%
  # reorder the coefficients so that the largest is at the top of the plot
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  # add lasso estimates (blue triangles)
  geom_point(aes(lasso, term),shape = 2, color = 'blue') +
  # add OLS estimates (red crosses)
  geom_point(aes(ls, term),shape = 4, color = 'red') +
  # add in a dotted line at zero
  geom_vline(xintercept = 0, lty = 2) +
  labs(
    x = "Estimate of effect of variable on lpsa",
    y = NULL,
    title = "Coefficient plot with error bars") +
  theme_bw()

Compute the RMSE:

# Posterior-mean prediction: prepend a column of ones because bl_what
# includes the intercept as its first element.
yhat = cbind(rep(1,Nnew),Xtest)%*%bl_what
rmse_blasso = sqrt(mean((yhat-test_df$lpsa)^2))
print(paste("The RMSE for the bayesian lasso is ",rmse_blasso))
## [1] "The RMSE for the bayesian lasso is  0.588250333489908"

Exercises

  1. Try changing s and seeing how it affects the results. What is a reasonable value based on the estimated \(\lambda\) from lasso?
  2. Try changing the initialization to a random draw from the prior. How does this affect the results?
  3. Try adding a step to sample \(s\).
  4. Suppose \(N\) is relatively large. What can be done to help reduce the computational complexity?

Bayesreg

You can also try to compare with the package bayesreg, which implements a number of shrinkage priors.

library(bayesreg)
# Fit the Bayesian lasso via bayesreg for comparison with the hand-rolled
# Gibbs sampler; n.samples matches the I = 2000 iterations used above.
fit <- bayesreg(lpsa~., train_df, model = "gaussian", prior = "lasso", n.samples = 2000)

How do the results compare?

# Traceplots
# bayesreg stores draws with samples along columns, so transpose before
# converting to coda mcmc objects.
traceplot(as.mcmc(t(fit$beta)))

traceplot(as.mcmc(t(fit$beta0)))

traceplot(as.mcmc(t(fit$sigma2)))

traceplot(as.mcmc(t(fit$tau2)))

# Check other diagnostics in coda

# Remove burnin
# NOTE(review): this reuses I (= 2000) from the Gibbs section; it only works
# because n.samples above equals I — revisit if either value changes.
b = 1000
fit$beta = fit$beta[,b:I]
fit$beta0 = fit$beta0[b:I]
fit$sigma2 = fit$sigma2[b:I]
fit$tau2 = fit$tau2 [b:I]
# Compare scatter plot of samples from the two mcmc algorithms
# ind1/ind2 select two coefficients (columns 1 and 6 of the predictors,
# i.e. lcavol and lcp); the +1 offset on blasso_out$w skips the intercept.
coeff_names = names(train_df)[1:D]
ind1 = 1
ind2 = 6
df = data.frame(w1 = c(blasso_out$w[,ind1 +1], fit$beta[ind1,]), w2 = c(blasso_out$w[,ind2 +1], fit$beta[ind2,]), 
                mcmc = c(rep("Gibbs", dim(blasso_out$w)[1]), rep("HMC", dim(fit$beta)[2])))
ggplot(df) +
  geom_point(aes(x = w1, y =w2, color = mcmc)) + 
  theme_bw() +
  labs(x=coeff_names[ind1],y=coeff_names[ind2])